Load preprocessed dataset (preprocessing code in data_preprocessing.Rmd)
# Report dataset dimensions (genes x samples) and the diagnosis breakdown.
# datExpr (expression matrix, genes in rows) and datMeta (sample metadata)
# are loaded by the preprocessing step referenced above.
glue('Number of genes: ', nrow(datExpr), '\n',
'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 30107
## Number of samples: 80 (45 ASD, 35 controls)
Significance criteria: adjusted p-value<0.05
# Keep only the differentially expressed genes (adjusted p-value < 0.05).
# DE_info is assumed to be row-aligned with datExpr -- TODO confirm upstream.
# Base subsetting is used instead of dplyr::filter because dplyr verbs
# silently drop row names from a base data.frame, which would lose the
# gene identifiers stored in rownames(datExpr).
keep_genes = !is.na(DE_info$padj) & DE_info$padj < 0.05
datExpr = datExpr[keep_genes,]
print(paste0(nrow(datExpr), ' genes left.'))
## [1] "3000 genes left."
Since there are more genes than samples, we can perform PCA and reduce the dimension from 30K to 80 without losing any information, and use this reduced dataset for methods that take too long on the whole dataset.
# Transpose so samples are rows, which is what the sample-level methods expect
datExpr_t = t(datExpr)
# With more genes than samples, PCA keeps all the information in
# ncol(datExpr) components (one per sample)
pca = prcomp(datExpr_t)
datExpr_redDim = data.frame(pca$x)
# Every clustering method stores its sample assignments in this list
clusterings = list()
Chose k=3 as best number of clusters
# K-means: elbow plot of total within-cluster sum of squares for k = 1..10
set.seed(123)
wss = vapply(1:10,
             function(n_centers) kmeans(datExpr_t, n_centers, nstart=25)$tot.withinss,
             numeric(1))
plot(wss, type='b', main='K-Means Clustering')
best_k = 3
abline(v = best_k, col='blue')
# Final K-means run with the chosen k; store the assignments
datExpr_k_means = kmeans(datExpr_t, best_k, nstart=25)
clusterings[['KMeans']] = datExpr_k_means$cluster
Chose k=7 as best number of clusters.
# Hierarchical clustering of samples on euclidean distances
sample_dists = dist(datExpr_t)
h_clusts = as.dendrogram(hclust(sample_dists))
plot(h_clusts)
# Cut height roughly corresponding to the chosen number of clusters
abline(h=25, col='blue')
best_k = 7
Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.
Colors:
Diagnosis: Blue=control, Green=ASD
Sex: Pink=Female, Blue=Male
Brain region: Pink=Frontal, Green=Temporal, Blue=Parietal, Purple=Occipital
Age: Purple=youngest, Yellow=oldest
clusterings[['HC']] = cutree(h_clusts, best_k)
# Build a named vector mapping each age (min..max) to a viridis colour, so
# the same age always gets the same colour across plots.
# ages: numeric vector of ages; defaults to the global datMeta$Age (lazy
#       default keeps the original zero-argument call working unchanged).
# Returns: character vector of colours named by age.
# NOTE(review): assumes ages are whole numbers -- a non-integer age would
# have no matching name in the palette; confirm upstream.
create_viridis_dict = function(ages = datMeta$Age){
min_age = min(ages)
max_age = max(ages)
viridis_age_cols = viridis(max_age - min_age + 1)
names(viridis_age_cols) = seq(min_age, max_age)
return(viridis_age_cols)
}
viridis_age_cols = create_viridis_dict()
# Build one colour column per phenotype, with rows reordered via match() so
# they line up with the dendrogram leaves for colored_bars()
dend_meta = datMeta[match(labels(h_clusts), rownames(datMeta)),] %>%
mutate('Diagnosis' = ifelse(Diagnosis_=='CTL','#008080','#86b300'), # Blue control, Green ASD
'Sex' = ifelse(Sex=='F','#ff6666','#008ae6'), # Pink Female, Blue Male
'Region' = case_when(Brain_lobe=='Frontal'~'#F8766D', # ggplot defaults for 4 colours
Brain_lobe=='Temporal'~'#7CAE00',
Brain_lobe=='Parietal'~'#00BFC4',
Brain_lobe=='Occipital'~'#C77CFF'),
'Age' = viridis_age_cols[as.character(Age)]) %>% # Purple: young, Yellow: old
dplyr::select(Age, Region, Sex, Diagnosis)
# Blank the leaf labels, colour branches by the k=7 cut, then plot with the
# phenotype colour bars underneath (dendextend's set() / colored_bars())
h_clusts %>% set('labels', rep('', nrow(datMeta))) %>% set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)
Chose the best clustering to be with k=8, which is basically two big clusters and some outliers
*The rest of the output plots can be found in the Data/Gandal/consensusClustering/samples_DE_genes folder
Following this paper’s guidelines:
Run PCA and keep enough components to explain at least 60% of the variance (the first 2 components alone explain over 60% of the variance, so we decided to keep the first 37 components, which accumulate 90% of the variance)
Run ICA with that same number of nbComp as principal components kept to then filter them
Select components with kurtosis > 3
Assign obs to genes with FDR<0.01 using the fdrtool package
Note: Using the PCA reduced matrix because the algorithm didn’t converge with the original dataset
It’s not supposed to be a good method for clustering samples because ICA does not perform well with small samples (see Figure 4 of this paper)
Still, it leaves only 7 samples without a cluster
ICA_clusters %>% rowSums %>% table
## .
## 0 1 2 3 4 6
## 7 36 23 10 3 1
# ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2,Var1)) +
# geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') +
# theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()
This method doesn’t work well:
Using power=5, as it is the first power that exceeds the 0.85 threshold, and the PCA-reduced version of the dataset, because the original one never reaches the \(R^2\) threshold
best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = 1:30)
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.0873 -0.323 0.0532 1.60e+01 1.56e+01 2.64e+01
## 2 2 0.5580 -0.912 0.7120 4.65e+00 4.03e+00 1.11e+01
## 3 3 0.6670 -1.060 0.6750 1.64e+00 1.18e+00 5.18e+00
## 4 4 0.8300 -1.090 0.8940 6.53e-01 3.97e-01 2.58e+00
## 5 5 0.8970 -1.070 0.9420 2.85e-01 1.48e-01 1.34e+00
## 6 6 0.8970 -1.050 0.9570 1.33e-01 6.02e-02 7.17e-01
## 7 7 0.9240 -1.060 0.9560 6.53e-02 2.62e-02 3.91e-01
## 8 8 0.9440 -1.040 0.9700 3.35e-02 1.06e-02 2.16e-01
## 9 9 0.1710 -2.070 -0.0377 1.78e-02 4.29e-03 1.21e-01
## 10 10 0.1670 -2.010 -0.0484 9.77e-03 1.87e-03 6.86e-02
## 11 11 0.1610 -1.900 -0.0620 5.50e-03 8.30e-04 3.92e-02
## 12 12 0.1770 -2.610 -0.0381 3.16e-03 3.73e-04 2.25e-02
## 13 13 0.1410 -2.230 -0.1030 1.85e-03 1.70e-04 1.32e-02
## 14 14 0.7960 -0.898 0.8180 1.11e-03 7.80e-05 8.34e-03
## 15 15 0.1060 -1.460 -0.1200 6.69e-04 3.61e-05 5.33e-03
## 16 16 0.7870 -0.865 0.8090 4.10e-04 1.69e-05 3.44e-03
## 17 17 0.8150 -0.870 0.8420 2.55e-04 7.74e-06 2.24e-03
## 18 18 0.0489 -0.916 -0.0662 1.60e-04 3.45e-06 1.48e-03
## 19 19 0.0823 -1.600 -0.0938 1.01e-04 1.54e-06 9.78e-04
## 20 20 0.1180 -1.840 -0.1320 6.43e-05 6.86e-07 6.52e-04
## 21 21 0.1890 -1.680 0.0398 4.13e-05 3.07e-07 4.37e-04
## 22 22 0.1900 -1.700 0.0396 2.67e-05 1.38e-07 2.94e-04
## 23 23 0.1960 -2.360 0.0109 1.73e-05 6.20e-08 1.98e-04
## 24 24 0.1940 -2.300 0.0247 1.13e-05 2.79e-08 1.34e-04
## 25 25 0.2140 -2.680 0.0094 7.41e-06 1.26e-08 9.13e-05
## 26 26 0.2210 -2.640 0.0197 4.87e-06 5.70e-09 6.21e-05
## 27 27 0.1920 -2.590 -0.0360 3.22e-06 2.58e-09 4.23e-05
## 28 28 0.2100 -2.430 0.0107 2.13e-06 1.17e-09 2.89e-05
## 29 29 0.2230 -2.780 0.0014 1.42e-06 5.31e-10 1.98e-05
## 30 30 0.2300 -2.740 0.0104 9.42e-07 2.40e-10 1.35e-05
Running WGCNA with power=5
# Best power
# Build the WGCNA network with the estimated soft-threshold power
# (powerEstimate = 5, the first power whose SFT.R.sq exceeds 0.85 above)
network = datExpr_redDim %>% t %>% blockwiseModules(power=best_power$powerEstimate, numericLabels=TRUE)
## mergeCloseModules: less than two proper modules.
## ..color levels are 0, 1
## ..there is nothing to merge.
Cluster distribution
# Module sizes found by WGCNA (module 0 = unassigned samples)
table(network$colors)
##
## 0 1
## 41 39
# Store the WGCNA module labels as a clustering, named by sample
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)
Using the original expression dataset, the \(R^2\) threshold is never achieved, getting closest at power 1, but still doesn’t manage to find any clusters within the data
best_power = datExpr %>% pickSoftThreshold(powerVector = 1:30)
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.298 666.0 0.1110 77.9 77.9 78.2
## 2 2 0.297 333.0 0.1090 76.7 76.9 77.3
## 3 3 0.297 222.0 0.1090 75.6 75.8 76.5
## 4 4 0.296 167.0 0.1070 74.6 74.8 75.7
## 5 5 0.298 134.0 0.1080 73.5 73.8 74.9
## 6 6 0.297 112.0 0.1070 72.4 72.8 74.2
## 7 7 0.297 95.5 0.1060 71.4 71.9 73.4
## 8 8 0.296 83.5 0.1050 70.4 70.9 72.6
## 9 9 0.295 74.2 0.1030 69.4 70.0 71.9
## 10 10 0.298 67.3 0.1050 68.4 69.0 71.1
## 11 11 0.298 61.2 0.1040 67.4 68.1 70.4
## 12 12 0.297 56.1 0.1020 66.5 67.2 69.6
## 13 13 0.292 51.2 0.0976 65.5 66.3 68.9
## 14 14 0.292 47.5 0.0967 64.6 65.4 68.2
## 15 15 0.291 44.3 0.0955 63.7 64.6 67.5
## 16 16 0.290 41.5 0.0941 62.8 63.7 66.8
## 17 17 0.289 39.1 0.0927 61.9 62.9 66.1
## 18 18 0.288 36.9 0.0914 61.1 62.1 65.4
## 19 19 0.288 34.9 0.0900 60.2 61.2 64.7
## 20 20 0.287 33.2 0.0886 59.4 60.4 64.0
## 21 21 0.286 31.7 0.0883 58.6 59.6 63.4
## 22 22 0.285 30.2 0.0870 57.7 58.9 62.7
## 23 23 0.285 28.9 0.0855 56.9 58.1 62.1
## 24 24 0.284 27.7 0.0849 56.1 57.3 61.4
## 25 25 0.282 26.8 0.0835 55.4 56.6 60.8
## 26 26 0.281 25.8 0.0820 54.6 55.9 60.2
## 27 27 0.280 24.7 0.0802 53.9 55.1 59.6
## 28 28 0.279 23.9 0.0787 53.1 54.4 58.9
## 29 29 0.279 23.0 0.0776 52.4 53.7 58.3
## 30 30 0.278 22.3 0.0762 51.7 53.0 57.7
Points don’t seem to follow a Gaussian distribution no matter the number of clusters, so we chose 3 groups, following the best k from K-means, because the methods are similar
# BIC curve over 1..50 mixture components (ClusterR)
n_clust = Optimal_Clusters_GMM(datExpr_redDim, max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')
# Reuse the k chosen for K-means, since the two methods are closely related
best_k = 3
gmm = GMM(datExpr_redDim, best_k)
# Assign each sample to the component with the highest log-likelihood
clusterings[['GMM']] = apply(gmm$Log_likelihood, 1, which.max)
Plot of clusters with their centroids in gray
# Append the component centroids (given the PC column names) to the samples
# so they can be drawn on the same scatter plot; centroids get label NA
centroids_df = setNames(data.frame(gmm$centroids), names(datExpr_redDim))
gmm_points = rbind(datExpr_redDim, centroids_df)
gmm_labels = as.factor(c(clusterings[['GMM']], rep(NA, best_k)))
ggplotly(ggplot(gmm_points, aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
# Clean up intermediates from the clustering sections. Some of these
# (cc_output, ICA_output, ICA_clusters_names, signals_w_kurtosis) come from
# chunks not shown in this excerpt.
# NOTE(review): 'c' here presumably refers to a leftover temp/loop variable
# defined in a hidden chunk; if no such object exists, rm() will warn --
# confirm against the full document.
rm(wss, datExpr_k_means, h_clusts, cc_output, best_k, ICA_output, ICA_clusters_names,
signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network, dend_meta,
best_power, c, viridis_age_cols, create_viridis_dict, pca)
Using Adjusted Rand Index:
K-means, Hierarchical Clustering, Consensus Clustering and Gaussian Mixtures seem to give very similar clusterings
ASD seems to be captured best by K-Means and Consensus Clustering followed by Hierarchical clustering and Gaussian Mixtures
No clusterings were able to capture any other phenotype feature
# Pool the expression-based clusterings with the phenotype annotations so
# all pairwise similarities can be compared in one matrix
clusters_plus_phenotype = clusterings
clusters_plus_phenotype[['Region']] = datMeta$Brain_lobe
clusters_plus_phenotype[['Sex']] = datMeta$Sex
clusters_plus_phenotype[['Age']] = datMeta$Age
clusters_plus_phenotype[['Subject']] = datMeta$Subject_ID
clusters_plus_phenotype[['ASD']] = datMeta$Diagnosis_
n_groupings = length(clusters_plus_phenotype)
# Adjusted Rand Index for every pair; only the upper triangle (incl. the
# diagonal, which is trivially 1) is filled -- the lower triangle stays NA
cluster_sim = data.frame(matrix(nrow = n_groupings, ncol = n_groupings))
for(i in seq_len(n_groupings)){
cluster1 = as.factor(clusters_plus_phenotype[[i]])
for(j in i:n_groupings){
cluster2 = as.factor(clusters_plus_phenotype[[j]])
cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
}
}
colnames(cluster_sim) = names(clusters_plus_phenotype)
rownames(cluster_sim) = colnames(cluster_sim)
cluster_sim = round(as.matrix(cluster_sim), 2)
# Annotated heatmap of the similarity matrix (no clustering of rows/cols)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none',
cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
cexRow = 1, cexCol = 1, margins = c(7,7))
rm(i, j, cluster1, cluster2, clusters_plus_phenotype, cluster_sim, n_groupings)
# Assemble one data frame with the first 3 PCs plus every clustering and
# phenotype variable, for the interactive scatter plot below.
# NOTE(review): the CC (consensus clustering) and ICA entries of
# 'clusterings' are produced in chunks not shown in this excerpt -- confirm
# they exist before this point.
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
mutate(ID = rownames(.), Subject_ID = datMeta$Subject_ID,
KMeans = as.factor(clusterings[['KMeans']]), Hierarchical = as.factor(clusterings[['HC']]),
Consensus = as.factor(clusterings[['CC']]), ICA = as.factor(clusterings[['ICA']]),
GMM = as.factor(clusterings[['GMM']]), WGCNA = as.factor(clusterings[['WGCNA']]),
Sex = as.factor(datMeta$Sex), Region = as.factor(datMeta$Brain_lobe),
Diagnosis = as.factor(datMeta$Diagnosis_), Age = datMeta$Age)
Now, PC1 seems to separate samples by Diagnosis pretty well
selectable_scatter_plot(plot_points, plot_points)